0.1 Load Dataset

# Name of the CSV file to load; a relative path, so it is resolved against
# the current working directory.
input_file <- "CD_additional_balanced.csv"

# Record the working directory. No setwd() call is made: setting the
# directory to the value getwd() just returned changes nothing.
work_dir <- getwd()

# Read the raw data and keep text columns as character vectors.
# FALSE is spelled out because F is an ordinary, reassignable variable.
cd_data <- read.csv(file = input_file, stringsAsFactors = FALSE)

0.2 Inspect and factor dataset

# Wrap the raw data frame in a data.table.
cd_dt <- data.table(cd_data)

# Columns that are turned into factors.
factor_cols <- c(
  "job", "marital", "education", "default", "housing", "loan",
  "contact", "month", "day_of_week", "poutcome", "y"
)

# Replace each listed column, by reference, with its factor version.
for (factor_col in factor_cols) {
  set(cd_dt, j = factor_col, value = as.factor(cd_dt[[factor_col]]))
}

# Print the compact structure of cd_dt: one line per column with its type
# and the first few values.
str(cd_dt)
## Classes 'data.table' and 'data.frame':   9280 obs. of  21 variables:
##  $ age           : int  41 49 49 41 45 42 39 28 44 42 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 3 10 10 2 2 4 12 8 10 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 1 2 2 2 2 2 2 3 2 2 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 7 3 6 3 3 3 8 4 6 ...
##  $ default       : Factor w/ 2 levels "no","unknown": 2 2 1 2 2 1 1 2 1 1 ...
##  $ housing       : Factor w/ 3 levels "no","unknown",..: 3 3 1 3 3 3 3 3 3 1 ...
##  $ loan          : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 3 1 3 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 4 4 4 ...
##  $ duration      : int  1575 1042 1467 579 461 673 935 1201 1030 1623 ...
##  $ campaign      : int  1 1 1 1 1 2 3 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
##  $ y             : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Print per-column summaries of cd_dt: quartiles and mean for numeric
# columns, level counts for factor columns.
summary(cd_dt)
##       age                job           marital                   education   
##  Min.   :17.0   admin.     :2517   divorced:1021   university.degree  :3007  
##  1st Qu.:31.0   blue-collar:1769   married :5338   high.school        :2102  
##  Median :38.0   technician :1459   single  :2900   professional.course:1190  
##  Mean   :40.4   services   : 773   unknown :  21   basic.9y           :1177  
##  3rd Qu.:48.0   management : 651                   basic.4y           : 895  
##  Max.   :98.0   retired    : 595                   basic.6y           : 458  
##                 (Other)    :1516                   (Other)            : 451  
##     default        housing          loan           contact         month     
##  no     :7824   no     :4104   no     :7688   cellular :6672   may    :2533  
##  unknown:1456   unknown: 225   unknown: 225   telephone:2608   jul    :1477  
##                 yes    :4951   yes    :1367                    aug    :1353  
##                                                                jun    :1169  
##                                                                nov    : 886  
##                                                                apr    : 785  
##                                                                (Other):1077  
##  day_of_week    duration         campaign          pdays          previous     
##  fri:1763    Min.   :   1.0   Min.   : 1.000   Min.   :  0.0   Min.   :0.0000  
##  mon:1846    1st Qu.: 145.0   1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.0000  
##  thu:2000    Median : 265.0   Median : 2.000   Median :999.0   Median :0.0000  
##  tue:1810    Mean   : 387.4   Mean   : 2.333   Mean   :887.3   Mean   :0.3153  
##  wed:1861    3rd Qu.: 528.0   3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.0000  
##              Max.   :4199.0   Max.   :39.000   Max.   :999.0   Max.   :6.0000  
##                                                                                
##         poutcome     emp.var.rate     cons.price.idx  cons.conf.idx   
##  failure    :1074   Min.   :-3.4000   Min.   :92.20   Min.   :-50.80  
##  nonexistent:7244   1st Qu.:-1.8000   1st Qu.:92.89   1st Qu.:-42.70  
##  success    : 962   Median :-0.1000   Median :93.44   Median :-41.80  
##                     Mean   :-0.4963   Mean   :93.48   Mean   :-40.22  
##                     3rd Qu.: 1.4000   3rd Qu.:93.99   3rd Qu.:-36.40  
##                     Max.   : 1.4000   Max.   :94.77   Max.   :-26.90  
##                                                                       
##    euribor3m      nr.employed     y       
##  Min.   :0.634   Min.   :4964   no :4640  
##  1st Qu.:1.244   1st Qu.:5076   yes:4640  
##  Median :4.021   Median :5191             
##  Mean   :2.960   Mean   :5135             
##  3rd Qu.:4.959   3rd Qu.:5228             
##  Max.   :5.045   Max.   :5228             
## 

0.3 Histograms: Age, Duration, Campaign, Pdays

# Age distribution: histogram, boxplot, then the deciles (0%, 10%, ..., 100%).
hist(cd_dt[["age"]], main = "Histogram of Age", xlab = "Age")

boxplot(cd_dt[["age"]], main = "Boxplot of Age", ylab = "Age")

quantile(cd_dt[["age"]], probs = seq(0, 1, by = 0.1))
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##   17   27   30   33   35   38   41   46   51   57   98
# Duration distribution: histogram, boxplot, then the deciles.
hist(cd_dt[["duration"]], main = "Histogram of Duration", xlab = "Duration")

boxplot(cd_dt[["duration"]], main = "Boxplot of Duration", ylab = "Duration")

quantile(cd_dt[["duration"]], probs = seq(0, 1, by = 0.1))
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##    1   80  124  167  211  265  340  452  615  860 4199
# Campaign distribution: histogram, boxplot, then the deciles.
hist(cd_dt[["campaign"]], main = "Histogram of Campaign", xlab = "Campaign")

boxplot(cd_dt[["campaign"]], main = "Boxplot of Campaign", ylab = "Campaign")

quantile(cd_dt[["campaign"]], probs = seq(0, 1, by = 0.1))
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##    1    1    1    1    1    2    2    2    3    4   39
# Pdays distribution: histogram, boxplot, then the deciles.
# NOTE(review): the deciles printed below are 999 from the 20% mark upward,
# which looks like a placeholder value rather than a real day count - confirm
# before reading anything into these plots.
hist(cd_dt[["pdays"]], main = "Histogram of Pdays", xlab = "Pdays")

boxplot(cd_dt[["pdays"]], main = "Boxplot of Pdays", ylab = "Pdays")

quantile(cd_dt[["pdays"]], probs = seq(0, 1, by = 0.1))
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##    0   11  999  999  999  999  999  999  999  999  999

0.4 CD subscription (y), job, education, poutcome

# Count of rows per job category, printed explicitly.
job_table <- table(cd_dt[["job"]])
print(job_table)
## 
##        admin.   blue-collar  entrepreneur     housemaid    management 
##          2517          1769           308           216           651 
##       retired self-employed      services       student    technician 
##           595           306           773           358          1459 
##    unemployed       unknown 
##           248            80
# Share of rows per job category, shown to two decimal places.
job_prop_table <- prop.table(job_table)
round(job_prop_table, 2)
## 
##        admin.   blue-collar  entrepreneur     housemaid    management 
##          0.27          0.19          0.03          0.02          0.07 
##       retired self-employed      services       student    technician 
##          0.06          0.03          0.08          0.04          0.16 
##    unemployed       unknown 
##          0.03          0.01
# Bar chart of the counts.
barplot(job_table, main = "Job")

# Count of rows per education level, printed explicitly.
edu_table <- table(cd_dt[["education"]])
print(edu_table)
## 
##            basic.4y            basic.6y            basic.9y         high.school 
##                 895                 458                1177                2102 
##          illiterate professional.course   university.degree             unknown 
##                   6                1190                3007                 445
# Share of rows per education level, shown to two decimal places.
edu_prop_table <- prop.table(edu_table)
round(edu_prop_table, 2)
## 
##            basic.4y            basic.6y            basic.9y         high.school 
##                0.10                0.05                0.13                0.23 
##          illiterate professional.course   university.degree             unknown 
##                0.00                0.13                0.32                0.05
# Bar chart of the counts.
barplot(edu_table, main = "Education Level")

# Count of rows per previous-outcome category, printed explicitly.
poutcome_table <- table(cd_dt[["poutcome"]])
print(poutcome_table)
## 
##     failure nonexistent     success 
##        1074        7244         962
# Share of rows per previous-outcome category, shown to two decimal places.
poutcome_prop_table <- prop.table(poutcome_table)
round(poutcome_prop_table, 2)
## 
##     failure nonexistent     success 
##        0.12        0.78        0.10
# Bar chart of the counts.
barplot(poutcome_table, main = "Previous Outcome")

# Count of rows per value of y, printed explicitly.
y_table <- table(cd_dt[["y"]])
print(y_table)
## 
##   no  yes 
## 4640 4640
# Share of rows per value of y, shown to two decimal places.
y_prop_table <- prop.table(y_table)
round(y_prop_table, 2)
## 
##  no yes 
## 0.5 0.5
# Bar chart of the counts.
barplot(y_table, main = "CD Subscribed")

0.5 Variable Relationships

# Numeric columns whose pairwise relationships are examined.
rel_cols <- c(
  "age", "duration", "campaign", "pdays",
  "euribor3m", "emp.var.rate", "nr.employed"
)
# Correlation matrix of those columns; with = FALSE makes data.table treat
# rel_cols as a vector of column names.
cor(cd_dt[, rel_cols, with = FALSE])
##                       age    duration     campaign       pdays   euribor3m
## age           1.000000000 -0.02072651  0.003690016 -0.05351616 -0.04462745
## duration     -0.020726510  1.00000000 -0.025872465  0.02893622  0.05733951
## campaign      0.003690016 -0.02587247  1.000000000  0.08930062  0.17512283
## pdays        -0.053516156  0.02893622  0.089300624  1.00000000  0.38773934
## euribor3m    -0.044627449  0.05733951  0.175122827  0.38773934  1.00000000
## emp.var.rate -0.049052629  0.07144035  0.185736186  0.33488799  0.95840218
## nr.employed  -0.074686516  0.05823209  0.176972215  0.47499217  0.94054583
##              emp.var.rate nr.employed
## age           -0.04905263 -0.07468652
## duration       0.07144035  0.05823209
## campaign       0.18573619  0.17697221
## pdays          0.33488799  0.47499217
## euribor3m      0.95840218  0.94054583
## emp.var.rate   1.00000000  0.86752989
## nr.employed    0.86752989  1.00000000
# Pairwise panel plot of the columns named in rel_cols.
pairs.panels(cd_dt[, rel_cols, with = FALSE])

# Age split by y: one box per group, then per-group summary statistics.
boxplot(age ~ y, data = cd_dt)

aggregate(age ~ y, data = cd_dt, FUN = summary)
##     y age.Min. age.1st Qu. age.Median age.Mean age.3rd Qu. age.Max.
## 1  no 17.00000    32.00000   38.00000 39.89375    47.00000 88.00000
## 2 yes 17.00000    31.00000   37.00000 40.91315    50.00000 98.00000
# Duration split by y: one box per group, then per-group summary statistics.
boxplot(duration ~ y, data = cd_dt)

aggregate(duration ~ y, data = cd_dt, FUN = summary)
##     y duration.Min. duration.1st Qu. duration.Median duration.Mean
## 1  no        1.0000          94.0000        166.0000      221.5323
## 2 yes       37.0000         253.0000        449.0000      553.1912
##   duration.3rd Qu. duration.Max.
## 1         279.2500     1994.0000
## 2         741.2500     4199.0000
# Campaign split by y: one box per group, then per-group summary statistics.
boxplot(campaign ~ y, data = cd_dt)

aggregate(campaign ~ y, data = cd_dt, FUN = summary)
##     y campaign.Min. campaign.1st Qu. campaign.Median campaign.Mean
## 1  no      1.000000         1.000000        2.000000      2.614871
## 2 yes      1.000000         1.000000        2.000000      2.051724
##   campaign.3rd Qu. campaign.Max.
## 1         3.000000     39.000000
## 2         2.000000     23.000000
# Pdays split by y: one box per group, then per-group summary statistics.
boxplot(pdays ~ y, data = cd_dt)

aggregate(pdays ~ y, data = cd_dt, FUN = summary)
##     y pdays.Min. pdays.1st Qu. pdays.Median pdays.Mean pdays.3rd Qu. pdays.Max.
## 1  no     0.0000      999.0000     999.0000   982.5293      999.0000   999.0000
## 2 yes     0.0000      999.0000     999.0000   792.0356      999.0000   999.0000
# euribor3m split by y: one box per group, then per-group summary statistics.
boxplot(euribor3m ~ y, data = cd_dt)

aggregate(euribor3m ~ y, data = cd_dt, FUN = summary)
##     y euribor3m.Min. euribor3m.1st Qu. euribor3m.Median euribor3m.Mean
## 1  no       0.635000          1.405000         4.857000       3.797283
## 2 yes       0.634000          0.849000         1.266000       2.123135
##   euribor3m.3rd Qu. euribor3m.Max.
## 1          4.962000       4.970000
## 2          4.406000       5.045000
# emp.var.rate split by y: one box per group, then per-group summary statistics.
boxplot(emp.var.rate ~ y, data = cd_dt)

aggregate(emp.var.rate ~ y, data = cd_dt, FUN = summary)
##     y emp.var.rate.Min. emp.var.rate.1st Qu. emp.var.rate.Median
## 1  no        -3.4000000           -1.8000000           1.1000000
## 2 yes        -3.4000000           -1.8000000          -1.8000000
##   emp.var.rate.Mean emp.var.rate.3rd Qu. emp.var.rate.Max.
## 1         0.2409052            1.4000000         1.4000000
## 2        -1.2334483           -0.1000000         1.4000000
# nr.employed split by y: one box per group, then per-group summary statistics.
boxplot(nr.employed ~ y, data = cd_dt)

aggregate(nr.employed ~ y, data = cd_dt, FUN = summary)
##     y nr.employed.Min. nr.employed.1st Qu. nr.employed.Median nr.employed.Mean
## 1  no         4963.600            5099.100           5195.800         5175.497
## 2 yes         4963.600            5017.500           5099.100         5095.116
##   nr.employed.3rd Qu. nr.employed.Max.
## 1            5228.100         5228.100
## 2            5191.000         5228.100

0.6 Scatterplots

# 3D scatterplot of age, campaign and duration. The plotting symbol encodes
# y: as.integer() on the factor yields 1 for its first level and 2 for its
# second. With highlight.3d = TRUE the point colour follows the position
# along the y axis, so colour does not identify the group.
# TRUE is spelled out because T is an ordinary, reassignable variable.
scatterplot3d(
  cd_dt$age, cd_dt$campaign, cd_dt$duration,
  highlight.3d = TRUE,
  pch = as.integer(cd_dt$y),
  main = "3D Scatterplot of CD data"
)
# The legend keys on symbol only; listing per-group colours here would
# suggest a colour coding that the highlighted points do not follow.
legend(
  "topright",
  legend = levels(cd_dt$y),
  pch = seq_along(levels(cd_dt$y)),
  cex = 0.8
)

# Same plot for nr.employed, euribor3m and duration.
scatterplot3d(
  cd_dt$nr.employed, cd_dt$euribor3m, cd_dt$duration,
  highlight.3d = TRUE,
  pch = as.integer(cd_dt$y),
  main = "3D Scatterplot of CD data"
)
legend(
  "topright",
  legend = levels(cd_dt$y),
  pch = seq_along(levels(cd_dt$y)),
  cex = 0.8
)